import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import *
import plotly.express as px
# Load the HR attrition dataset. NOTE: this file is a notebook export — bare
# expressions such as `hr_dataset` below displayed the object in a notebook
# cell and are no-ops when run as a plain script.
hr_dataset = pd.read_csv("HR_comma_sep (1).csv")
hr_dataset
hr_dataset.shape
# Lets first check if any rows/columns have any missing values
def display_missing(df):
    """Print the count of missing (NaN) values for every column of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; only read, never modified.
    """
    # isnull().sum() computes all per-column NaN counts in one vectorized pass
    # (the original re-summed one column per loop iteration). The loop-body
    # indentation below was lost in the notebook export and is restored here.
    for col, n_missing in df.isnull().sum().items():
        print('{} column missing values: {}'.format(col, n_missing))
    print('\n')
display_missing(hr_dataset)
# The raw file names the department column "sales" (perhaps a trick to test
# attention!) and misspells "monthly" — rename both before any analysis.
hr_dataset.rename(columns={'sales': 'department',
                           'average_montly_hours': 'average_monthly_hours'},
                  inplace=True)
hr_dataset
# No missing values, so we can progress.
# Quick look at summary statistics and pairwise correlations.
hr_dataset.describe()
hr_dataset.corr()
# Rank every numeric variable by the absolute strength of its correlation with
# 'left'. Compute corr() ONCE (the original recomputed it three times) and
# drop the first row, which is 'left' correlated with itself (always 1.0).
left_corr = hr_dataset.corr()['left'].reset_index() \
    .rename(columns={'index': 'variable', 'left': 'left correlation'})
rank_order = left_corr['left correlation'].abs().sort_values(ascending=False).index
ppp = left_corr.loc[rank_order].iloc[1:, :]
ppp
# Encode the two categorical columns:
#  * salary is ORDINAL (low < medium < high) — map it explicitly. BUG FIX:
#    LabelEncoder encodes labels alphabetically (high=0, low=1, medium=2),
#    which destroys the ordering the comment intended to preserve.
#  * department is nominal (categories unrelated) — one-hot encode it.
salary_order = {'low': 0, 'medium': 1, 'high': 2}
hr_dataset_encoded = hr_dataset.copy()
hr_dataset_encoded["salary"] = hr_dataset_encoded["salary"].map(salary_order)
hr_dataset_encoded = pd.concat(
    [hr_dataset_encoded,
     pd.get_dummies(hr_dataset_encoded['department'], prefix="department")],
    axis=1).drop(['department'], axis=1)
hr_dataset_encoded
# Per-column means of the encoded frame, as a tidy two-column table.
hr_dataset_encoded.describe().iloc[1:2, ].melt().rename(columns=({'value': 'mean'}))
hr_dataset_encoded.corr()
# Visualise the full correlation structure of the encoded dataset.
fig, axes = plt.subplots(1, 1, figsize=(18, 10))
heat = sns.heatmap(hr_dataset_encoded.corr(), annot=True)
heat.set_title('Correlation matrix for HR dataset')
plt.show()
# Rank all encoded variables by the absolute value of their correlation with
# 'left', dropping the self-correlation row.
encoded_left = hr_dataset_encoded.corr()['left'].reset_index()
ranked = encoded_left.rename(columns={'index': 'variable', 'left': 'left correlation'})
top_idx = encoded_left.left.abs().argsort().values[::-1][:11]
ranked.loc[top_idx].iloc[1:, :]
# Headcount of stayers (0) vs leavers (1).
hr_dataset.groupby(['left'])['left'].count().reset_index(name='Employee count')
sns.countplot(x="left", data=hr_dataset)
fig, ax = plt.subplots()
fig.set_size_inches(5, 5)
model_bar = sns.countplot(x="left", data=hr_dataset)
plt.title("How many employees have stayed and left")
# Write each bar's height inside the bar. The loop-body indentation below was
# lost in the notebook export and is restored here.
for p in model_bar.patches:
    model_bar.annotate(p.get_height(),
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha='center', va='center',
                       size=15,
                       xytext=(0, -35),
                       textcoords='offset points')
# Leaver/stayer counts broken down by tenure, project load and salary band.
time_spend_count = (hr_dataset.groupby(['time_spend_company', 'left'])['left']
                    .count().reset_index(name='Employee count'))
time_spend_count
sns.barplot(data=time_spend_count, x='time_spend_company',
            y='Employee count', hue='left')
number_project = (hr_dataset.groupby(['number_project', 'left'])['left']
                  .count().reset_index(name='Employee count'))
number_project
sns.barplot(data=number_project, x='number_project',
            y='Employee count', hue='left')
# we can see those on 2,6,7 project counts are more likely to leave than stay
salary_count = (hr_dataset.groupby(['salary', 'left'])['left']
                .count().reset_index(name='Employee count'))
salary_count
sns.barplot(data=salary_count, x='salary', y='Employee count', hue='left')
px.sunburst(salary_count, path=['salary', 'left'], values='Employee count')
# low has highest churn rate, and high has lowest churn rate
# Pie chart of employee headcount by department.
# (plotly.express is already imported as ``px`` at the top of the file; the
# duplicate import that used to live here was redundant and has been removed.)
dept_counts = hr_dataset['department'].value_counts()
pie = px.pie(hr_dataset, values=dept_counts, names=dept_counts.index,
             title='Employee pie chart by department',
             labels={'value': 'employees'})
pie.update_traces(textposition='inside', textinfo='percent+label')
pie.show()
# Leaver/stayer counts per department, with labelled bars.
department_count = hr_dataset.groupby(['department', 'left'])['left'].count().reset_index(name='Employee count')
department_count
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
model_bar = sns.barplot(x='department', y='Employee count', data=department_count, hue='left')
plt.title("How many employees have stayed and left by department")
# Label each bar with its count. The loop-body indentation below was lost in
# the notebook export and is restored here.
for p in model_bar.patches:
    model_bar.annotate('{:,.0f}'.format(p.get_height()),
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha='center', va='center',
                       size=15,
                       xytext=(0, -10),
                       textcoords='offset points')
# Most people who left came from sales followed by technical — expected, as
# these are the largest departments. To compare fairly, compute the leaving
# RATE within each department (percentage split of left=0/1 per department).
percentage_split_departments = hr_dataset.groupby(['department', 'left'])['left'].count().groupby(level=0).apply(lambda x: 100 * x / x.sum())\
    .reset_index(name='Leaving percentage').round(2)
percentage_split_departments[percentage_split_departments['left'] == 1].sort_values(by=['Leaving percentage'], ascending=False)
# hr has the highest leaving rate
# Satisfaction of leavers vs stayers.
box = px.box(hr_dataset, x="left", y="satisfaction_level", color="left",
             color_discrete_sequence=px.colors.qualitative.Set1)
box.show()
# Clearly those who left were less satisfied at work: median 0.41 vs 0.69.
pd.crosstab(hr_dataset['satisfaction_level'], hr_dataset['left']).plot(kind="bar", figsize=(20, 5))
plt.title('Employee churn count against Satisfaction level')
plt.xlabel('Satisfaction level ')
plt.ylabel('Count')
plt.show()
fig, axes = plt.subplots(3, 3, figsize=(18, 10))
fig.suptitle('Satisfaction level stripplots')
# One stripplot of satisfaction against each candidate driver, leavers only.
# Plotting in a loop replaces nine near-identical calls; the first panel keeps
# its original red colour.
leavers = hr_dataset[hr_dataset['left'] == 1]
panel_vars = ['left', 'last_evaluation', 'number_project',
              'average_monthly_hours', 'time_spend_company', 'Work_accident',
              'promotion_last_5years', 'department', 'salary']
for ax, var in zip(axes.flat, panel_vars):
    extra = {'color': 'r'} if var == 'left' else {}
    sns.stripplot(ax=ax, data=leavers, x=var, y='satisfaction_level', **extra)
# From these panels there are three leaver groups the client should focus on:
# 1st cluster: extremely unsatisfied, seemingly overworked — 5+ projects and
#   higher average hours than the others; around 4-5 years with the company
#   and never promoted despite high evaluation scores. Indeed the company is
#   overworking great employees.
# 2nd cluster: not very happy; many have lower average monthly hours than
#   others, low evaluation scores, are on their second project and have been
#   with the company 3-4 years. Perhaps they sense a lack of growth and the
#   low hours do not pay enough.
# 3rd cluster: very satisfied, so it is interesting to learn why they left.
#   Very high evaluation scores, around 4-5 projects, fairer monthly hours
#   than the similar-but-unsatisfied cluster 1 — yet underpaid (mostly low or
#   medium salary), suggesting they left for a better-paying company.
# Let's look further at these 3 groups.
# The first cluster sits below 0.15 in satisfaction level.
cluster = hr_dataset[(hr_dataset['satisfaction_level'] < 0.15)]
cluster
# The average number of projects here is 5.9 vs the 3.8 company average; the
# average monthly hours is 266.0 vs the 200.1 average of other employees.
cluster.describe().iloc[:, [2, 3]]
# Rank correlations with 'left' inside this cluster, by absolute value.
# Compute the filtered corr() ONCE (the original recomputed it twice).
cluster1_corr = hr_dataset_encoded[(hr_dataset_encoded['satisfaction_level'] < 0.15)] \
    .corr()['left'].reset_index()
cluster1_corr.rename(columns={'index': 'variable', 'left': 'left correlation'}) \
    .loc[cluster1_corr.left.abs().argsort().values[::-1][:11]].iloc[1:, :]
# These now include departments and salaries too, though they do not appear
# important to the reasons for leaving. Within this region many left due to
# low satisfaction driven by higher-than-average monthly hours and a high
# number of projects — working them fewer hours/projects would help.
fig, axes = plt.subplots(2, 2, figsize=(18, 10))
fig.suptitle('Countplots of employees in region 1')
sns.countplot(ax=axes[0, 0], x="left", data=cluster)
sns.countplot(ax=axes[0, 1], x="number_project", data=cluster, hue='left')
sns.countplot(ax=axes[1, 0], x="average_monthly_hours", data=cluster, hue='left')
sns.countplot(ax=axes[1, 1], x="time_spend_company", data=cluster, hue='left')
# Second cluster: satisfaction strictly between 0.35 and 0.5.
cluster2 = hr_dataset[(hr_dataset['satisfaction_level'] < 0.5) & (hr_dataset['satisfaction_level'] > 0.35)]
cluster2
# Rank correlations with 'left' inside this band; the mask and the filtered
# corr() are each computed ONCE (the original recomputed both twice).
band2 = (hr_dataset_encoded['satisfaction_level'] < 0.5) & (hr_dataset_encoded['satisfaction_level'] > 0.35)
cluster2_corr = hr_dataset_encoded[band2].corr()['left'].reset_index()
cluster2_corr.rename(columns={'index': 'variable', 'left': 'left correlation'}) \
    .loc[cluster2_corr.left.abs().argsort().values[::-1][:11]].iloc[1:, :]
# Perhaps the lack of growth and low evaluation scores around the 3-year mark
# suggest these lower-performing ex-employees sensed no progress at the
# company, or were potentially even let go.
fig, axes = plt.subplots(2, 2, figsize=(18, 10))
fig.suptitle('Countplots of employees in region 2')
sns.countplot(ax=axes[0, 0], x="left", data=cluster2)
sns.countplot(ax=axes[0, 1], x="number_project", data=cluster2, hue='left')
sns.countplot(ax=axes[1, 0], x="average_monthly_hours", data=cluster2, hue='left')
sns.countplot(ax=axes[1, 1], x="time_spend_company", data=cluster2, hue='left')
# Third cluster: highly satisfied employees (satisfaction > 0.7) who left.
cluster3 = hr_dataset[(hr_dataset['satisfaction_level'] > 0.7)]
cluster3
# Rank correlations with 'left' inside this cluster; the filtered corr() is
# computed ONCE (the original recomputed it twice).
cluster3_corr = hr_dataset_encoded[(hr_dataset_encoded['satisfaction_level'] > 0.7)] \
    .corr()['left'].reset_index()
cluster3_corr.rename(columns={'index': 'variable', 'left': 'left correlation'}) \
    .loc[cluster3_corr.left.abs().argsort().values[::-1][:11]].iloc[1:, :]
# Here time_spend_company is the main correlate of moving on — interesting,
# since this group was very happy and high performing.
fig, axes = plt.subplots(2, 2, figsize=(18, 10))
fig.suptitle('Countplots of employees in region 3')
sns.countplot(ax=axes[0, 0], x="left", data=cluster3)
sns.countplot(ax=axes[0, 1], x="number_project", data=cluster3, hue='left')
sns.countplot(ax=axes[1, 0], x="last_evaluation", data=cluster3, hue='left')
sns.countplot(ax=axes[1, 1], x="time_spend_company", data=cluster3, hue='left')
# Look at the spike in panel 3 (last_evaluation) more closely: near-perfect
# evaluation scores.
cluster4 = hr_dataset[(hr_dataset['last_evaluation'] > 0.99)]
cluster4
sns.countplot(x="left", data=cluster4)
sns.countplot(x="salary", data=cluster4, hue='left')
# From these plots: no one with a high salary and an evaluation score of 1
# left in this cluster.
hr_dataset_encoded
# Features are every encoded column except the target 'left'.
X = hr_dataset_encoded.drop(['left'], axis=1).values
y = hr_dataset_encoded['left'].values
# 80:20 train/test split with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train[0]  # some variables sit outside the 0-1 range of most of the others
# Standardise only the columns not already in [0, 1]: number_project (2),
# average_monthly_hours (3), time_spend_company (4) and salary (7).
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, [2, 3, 4, 7]] = sc.fit_transform(X_train[:, [2, 3, 4, 7]])
# BUG FIX: the test set must be transformed with the statistics learned on
# the TRAINING set (transform, not fit_transform) — refitting on the test
# data leaks information and scales the two sets inconsistently.
X_test[:, [2, 3, 4, 7]] = sc.transform(X_test[:, [2, 3, 4, 7]])
X_train[0]
# Begin model comparison: decision tree (entropy criterion) first, scored on
# the held-out 20% with a confusion matrix and accuracy percentage.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
dtc = DecisionTreeClassifier(random_state=0, criterion='entropy')
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
acc_dtc = 100 * accuracy_score(y_test, y_pred)
cm_dtc = confusion_matrix(y_test, y_pred)
print(cm_dtc)
acc_dtc
# Linear-kernel support vector classifier, evaluated the same way.
from sklearn.svm import SVC
svc = SVC(random_state=0, kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
acc_svc = 100 * accuracy_score(y_test, y_pred)
cm_svc = confusion_matrix(y_test, y_pred)
print(cm_svc)
acc_svc
# RBF-kernel support vector classifier.
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm_ksvc = confusion_matrix(y_test, y_pred)
# BUG FIX: this used to print cm_svc (the linear model's matrix); print the
# RBF model's own confusion matrix.
print(cm_ksvc)
acc_ksvc = accuracy_score(y_test, y_pred) * 100
acc_ksvc
# Logistic regression baseline, evaluated the same way.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
acc_lr = 100 * accuracy_score(y_test, y_pred)
cm_lr = confusion_matrix(y_test, y_pred)
print(cm_lr)
acc_lr
# Gaussian naive Bayes, evaluated the same way.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
acc_nb = 100 * accuracy_score(y_test, y_pred)
cm_nb = confusion_matrix(y_test, y_pred)
print(cm_nb)
acc_nb
# Random forest (1000 entropy trees), evaluated the same way.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=0, n_estimators=1000, criterion='entropy')
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
acc_rfc = 100 * accuracy_score(y_test, y_pred)
cm_rfc = confusion_matrix(y_test, y_pred)
print(cm_rfc)
acc_rfc
# 1-nearest-neighbour classifier (Minkowski p=2, i.e. Euclidean distance).
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(p=2, metric='minkowski', n_neighbors=1)
knc.fit(X_train, y_train)
y_pred = knc.predict(X_test)
acc_knc = 100 * accuracy_score(y_test, y_pred)
cm_knc = confusion_matrix(y_test, y_pred)
print(cm_knc)
acc_knc
# Summarise all seven models' accuracies in one table and bar chart.
# (The stray leading spaces in ' Random Forest' / ' KNeighbors' are removed.)
values = {'Model': ['Decision Tree', 'Linear SVC', 'RBF SVC', 'Logistic Regression',
                    'Naive Bayes', 'Random Forest', 'KNeighbors'],
          'Accuracy %': [acc_dtc, acc_svc, acc_ksvc, acc_lr, acc_nb, acc_rfc, acc_knc]}
model_accuracy_df = pd.DataFrame(values, columns=['Model', 'Accuracy %'])
model_accuracy_df['Accuracy %'] = model_accuracy_df['Accuracy %'].round(2)
model_accuracy_df
fig, ax = plt.subplots()
fig.set_size_inches(16, 5)
model_bar = sns.barplot(x="Model", y="Accuracy %", data=model_accuracy_df)
plt.title("Accuracies of each model")
# Label each bar with its accuracy. The loop-body indentation below was lost
# in the notebook export and is restored here.
for p in model_bar.patches:
    model_bar.annotate('{:.2f}%'.format(p.get_height()),
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha='center', va='center',
                       size=15,
                       xytext=(0, -15),
                       textcoords='offset points')
# Random forest has the best accuracy. Plot each model's confusion matrix.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Confusion Matrix for each Model')
sns.heatmap(cm_dtc, ax=axes[0, 0], annot=True, cbar=False, cmap="Greens", fmt="d").set_title('Decision Tree Model')
sns.heatmap(cm_svc, ax=axes[0, 1], annot=True, cbar=False, cmap="Greens", fmt="d").set_title('Linear SVC Model')
sns.heatmap(cm_ksvc, ax=axes[0, 2], annot=True, cbar=False, cmap="Greens", fmt="d").set_title('RBF SVC Model')
# BUG FIX: this panel plots cm_lr (logistic regression) but was mislabelled
# 'Linear Regression Model'.
sns.heatmap(cm_lr, ax=axes[1, 0], annot=True, cbar=False, cmap="Greens", fmt="d").set_title('Logistic Regression Model')
sns.heatmap(cm_rfc, ax=axes[1, 1], annot=True, cbar=False, cmap="Greens", fmt="d").set_title('Random Forest Model')
sns.heatmap(cm_knc, ax=axes[1, 2], annot=True, cbar=False, cmap="Greens", fmt="d").set_title('KNeighbours Model')
# NOTE(review): cm_nb (Naive Bayes) is computed earlier but has no panel —
# the 2x3 grid is full; confirm whether it was meant to be included.
# Random forest is the best model; show its matrix on its own.
sns.heatmap(cm_rfc, annot=True, cbar=False, cmap="Greens", fmt="d").set_title('Random Forest Confusion Matrix')
# BUG FIX: 'hr_dataset_analysis' was never defined anywhere in this file, so
# the original line raised NameError. Export the encoded analysis frame
# instead — presumably the intended one; TODO confirm with the author.
hr_dataset_encoded.to_csv('hr_analysis.csv')